import pandas as pd
import urllib
import numpy as np
import urllib.request
import re
from textblob import TextBlob
%run lib.py
# Movie-script scraper configuration.  Set `name` to the (already URL-encoded)
# title to download from dailyscript.com.  Other titles used during development:
#   Legally%20Blonde, aboutmary, 10Things, Friday%20The%2013th, Ghost%20Ship,
#   Juno, Reservoir+Dogs, shawshank, Sixth%20Sense,%20The, sunset_bld_3_21_49,
#   Titanic, toy_story, trainspotting, transformers, the-truman-show_shooting,
#   batman_production
name="magnolia"

# Most scripts are served as .html; the titles listed below are plain .txt.
ext="html"
txtfiles=["Ghost%20Ship", "Legally%20Blonde", "Friday%20The%2013th", "Juno", "Reservoir+Dogs", "Sixth%20Sense,%20The", "Titanic"]
if name in txtfiles:
    ext="txt"

# Download the raw script.  The context manager guarantees the connection is
# closed even if reading/decoding fails (the original used explicit close()).
with urllib.request.urlopen("http://www.dailyscript.com/scripts/" + name + "." + ext) as fp:
    mystr = fp.read().decode("utf8", "ignore")

# Split into lines, drop carriage returns, strip HTML tags.
liston = mystr.split("\n")
liston = [s.replace('\r', '') for s in liston]
liston = [re.sub(r'<[^<]+?>', '', text) for text in liston]
# The shawshank script uses tabs for layout; normalise them to spaces so the
# space-prefix detection below works uniformly.
if name=="shawshank":
    liston=[i.replace("\t", " ") for i in liston]

# Parser state.  NOTE: the original assigned these twice (first ' ', then '');
# only the final empty-string values were ever used, so the dead first set of
# assignments was removed.
char=""           # character currently speaking ("" = nobody speaking)
script=[]         # accumulated {char, dialogue, scene, adverb} records
charintro=""      # indentation prefix that introduces a character name
endofdialogue=""  # line that terminates a dialogue block
dialoguepre=""    # indentation prefix of a dialogue line
newscenepre=""    # indentation prefix of a scene heading
# Detect the layout prefixes by scanning the script with lib.py's
# nextbigchunk(): starting at line 45 it returns (new index, prefix) for the
# next run of similarly indented lines.  The chunks are assumed to occur in a
# fixed order: character names, adverbs/parentheticals, dialogue, scene
# headings.  (Semantics of nextbigchunk come from lib.py — not visible here.)
i=45
print("Characters")
i, charintro=nextbigchunk(liston, i)
print("Adverbs")
i, adverb=nextbigchunk(liston, i, adverbs=True)
print("Dialogues")
i, dialoguepre=nextbigchunk(liston, i)
print("New Scene:")
i, newscenepre=nextbigchunk(liston, i)

# "X" appears to mean no usable scene prefix was found; retry from line 100.
if newscenepre=="X":
    i=100
    i, newscenepre=nextbigchunk(liston, i)

# Per-script overrides.
if name=="aboutmary":
    newscenepre=" ".join(["" for i in range(56)])  # i.e. 55 spaces
# A scene prefix with the same length as the character prefix is ambiguous;
# disable scene detection in that case by using the sentinel "X".
if len(newscenepre)==len(charintro):
    newscenepre="X"

# A line matching the scene prefix also closes any open dialogue block.
endofdialogue=newscenepre
scene=1
# Walk the script line by line building dialogue records.  State machine:
# a line starting with `charintro` (and not a "(...)" parenthetical) opens a
# new record; dialogue/adverb-prefixed lines accumulate into it; a blank line
# or an `endofdialogue` line closes it; a scene-heading line bumps `scene`.
# FIX: bounds checks (len(s) > len(prefix)) were added — the original indexed
# s[len(prefix)] unconditionally and raised IndexError for a line exactly as
# long as the prefix (e.g. a whitespace-only line the length of charintro).
for s in liston:
    stripped = s.strip()
    # Character introduction: prefix matches, following char is not a space,
    # and the line is not a parenthetical like "(beat)".
    if (len(s) > len(charintro) and s[0:len(charintro)] == charintro
            and s[len(charintro)] != " "
            and stripped != "" and stripped[0] != "(" and stripped[-1] != ")"):
        char = s[len(charintro):]
        new = dict()
        new['char'] = char.strip()
        new['dialogue'] = ""
        new['scene'] = scene
        new['adverb'] = ""
    # Terminator or blank line: close and store the current record.
    if s == endofdialogue or s.replace(" ", "") == "":
        if char != "":
            char = ""
            script.append(new)
    # Dialogue line for the currently open character.
    if (char != "" and len(s) > len(dialoguepre)
            and s[0:len(dialoguepre)] == dialoguepre and s[len(dialoguepre)] != " "):
        if new['dialogue'] != "":
            new['dialogue'] = new['dialogue'] + " "
        new['dialogue'] = new['dialogue'] + s[len(dialoguepre):]
    # Adverb line: either the adverb prefix or any "(...)" parenthetical.
    if (char != "" and
            ((len(s) > len(adverb) and s[0:len(adverb)] == adverb and s[len(adverb)] != " ")
             or (len(s) > 1 and stripped != "" and stripped[0] == "(" and stripped[-1] == ")"))):
        if new['adverb'] != "":
            new['adverb'] = new['adverb'] + " "
        new['adverb'] = new['adverb'] + s[len(adverb):]
    # Scene heading: prefixed, longer than the prefix, and all uppercase.
    if (s[0:len(newscenepre)] == newscenepre and len(s) > len(newscenepre)
            and s.isupper() and s[len(newscenepre)] != " "):
        scene = scene + 1
# Persist the parsed script and reload it.  (Round-tripping through CSV turns
# empty dialogue fields into NaN, which later code explicitly checks for.)
# The original also had a bare `pd.DataFrame(script)` notebook-display
# expression here; it had no effect and was removed.
pd.DataFrame(script).to_csv(name+'.csv', index=None)
magnolia=pd.read_csv(name+'.csv')  # variable name kept from the original notebook
stopwords = getstopwords()         # stopword list from lib.py

# Normalise character names so the same speaker is not counted twice
# (e.g. "X" vs "X'S VOICE").
removedchars=["'S VOICE", "'S WHISPER VOICE", " GATOR"]
for suffix in removedchars:
    magnolia['char']=magnolia['char'].apply(lambda x: x.replace(suffix, ""))
# Build two lookup tables in a single pass over the rows (the original made
# four separate iterrows() passes and initialised an unused `i=0`):
#   scenes:      scene number -> list of distinct character names in the scene
#   appearances: character    -> total number of dialogue records
scenes=dict()
appearances=dict()
for _, row in magnolia.iterrows():
    scenes.setdefault(row['scene'], []).append(row['char'])
    appearances[row['char']] = appearances.get(row['char'], 0) + 1
# Deduplicate characters within each scene (order is arbitrary, exactly as
# with the original's set-based construction).
for sc in scenes:
    scenes[sc]=list(set(scenes[sc]))

# Every character seen anywhere in the script.
characters=list(appearances.keys())
# Rank characters by number of appearances and keep the ten most frequent.
a=pd.DataFrame(appearances, index=list(range(len(appearances))))
ranking=pd.DataFrame(a.transpose()[0].sort_values(0, ascending=False))
finalcharacters=[row_index for row_index, _ in ranking[0:10].iterrows()]
finalcharacters  # notebook display of the selection
# Write a semicolon-separated adjacency matrix of shared-scene "weight"
# between the top characters, and record the same numbers per ordered pair in
# couplesappearances under "F_S" keys.  A pair's weight is the total number
# of dialogue records across every scene in which both appear; only pairs
# with index(f) < index(s) get a non-zero weight, so each unordered pair is
# counted once.  FIX: `with open` guarantees the file is closed on error, and
# enumerate() replaces the hand-maintained counter `j`.
couplesappearances=dict()
with open(name+"_nodes.csv", "w") as nodes_file:
    # Header row: ";NAME1;NAME2;..."
    for s in finalcharacters:
        nodes_file.write(";")
        nodes_file.write(s)
    nodes_file.write("\n")

    for s in finalcharacters:
        row_weights=[0]*len(finalcharacters)
        for f in finalcharacters:
            couplesappearances[f+"_"+s]=0
        for j, f in enumerate(finalcharacters):
            for p in scenes:
                if (f in scenes[p] and s in scenes[p] and f!=s
                        and finalcharacters.index(f)<finalcharacters.index(s)):
                    # Contribution: number of dialogue records in this scene.
                    weight=len(magnolia[magnolia["scene"]==p])
                    row_weights[j]=row_weights[j]+weight
                    couplesappearances[f+"_"+s]=couplesappearances[f+"_"+s]+weight
        nodes_file.write(s)
        for w in row_weights:
            nodes_file.write(";")
            nodes_file.write(str(w))
        nodes_file.write("\n")
# Rank character pairs by shared-scene dialogue volume; keep the top four.
a=pd.DataFrame(couplesappearances, index=list(range(len(couplesappearances))))
pair_ranking=pd.DataFrame(a.transpose()[0].sort_values(0, ascending=False))
finalcouples=[pair for pair, _ in pair_ranking[0:4].iterrows()]
# Persist the selected characters and pairs, one name per line.
# FIX: `with open` so the files are closed even if a write fails.
with open(name+"_finalcharacters.csv", "w") as out:
    for s in finalcharacters:
        out.write(s+"\n")
with open(name+"_finalcouples.csv", "w") as out:
    for s in finalcouples:
        out.write(s+"\n")
# Characters with more than ten dialogue records.
# (Computed here but not referenced again in this notebook.)
importantchars=[character for character, count in appearances.items() if count > 10]
file=open(name+"_sentiment_overtime_individual.csv", "w")
file2=open(name+"_sentiment_overtime_individualminsmaxs.csv", "w")
for k in finalcharacters:
print(k)
dd=getdialogue(magnolia, k, k, scenes)
dd=[str(d) for d in dd]
polarities, subjectivities=getsentiment(dd)
%matplotlib inline
import matplotlib.pyplot as plt
moveda=maverage(polarities, dd, .99)
plt.plot(moveda)
i=0
for s in moveda:
file.write(k+","+str(float(i)/len(moveda))+", "+str(s)+"\n")
i=i+1
plt.ylabel('polarities')
plt.show()
file2.write(k+"| MIN| "+dd[moveda.index(np.min(moveda))]+"\n")
file2.write(k+"| MAX| "+dd[moveda.index(np.max(moveda))]+"\n")
print("MIN: "+dd[moveda.index(np.min(moveda))])
print("\n")
print("MAX: "+dd[moveda.index(np.max(moveda))])
file.close()
file2.close()
file=open(name+"_sentiment_overtime_couples.csv", "w")
file2=open(name+"_sentiment_overtime_couplesminsmaxs.csv", "w")
for k in finalcouples:
print(k)
liston=k.split("_")
dd=getdialogue(magnolia, liston[0], liston[1], scenes)
dd=[str(d) for d in dd]
polarities, subjectivities=getsentiment(dd)
%matplotlib inline
import matplotlib.pyplot as plt
moveda=maverage(polarities, dd, .99)
plt.plot(moveda)
i=0
for s in moveda:
file.write(k+","+str(float(i)/len(moveda))+", "+str(s)+"\n")
i=i+1
plt.ylabel('polarities')
plt.show()
file2.write(k+"| MIN| "+dd[moveda.index(np.min(moveda))]+"\n")
file2.write(k+"| MAX| "+dd[moveda.index(np.max(moveda))]+"\n")
print("MIN: "+dd[moveda.index(np.min(moveda))])
print("\n")
print("MAX: "+dd[moveda.index(np.max(moveda))])
file.close()
file2.close()
# Replace every character in every scene list with an "INSCENE_<name>" token,
# so the baskets below can distinguish being present in a scene from speaking.
# The original removed scenes[key][0] and appended its prefixed form while
# iterating the same list — which happens to visit each original element
# exactly once (length stays constant), but is fragile; a comprehension is
# equivalent and explicit.
for key in scenes:
    scenes[key]=["INSCENE_"+member for member in scenes[key]]

# NOTE(review): the original called magnolia.dropna(subset=['dialogue'])
# without assigning the result — a no-op.  NaN dialogue rows are skipped
# explicitly in the basket-building loop, so the dead call (and a stray
# bare `1` expression) were removed.
# Build "baskets" for association-rule mining: one basket per dialogue
# record, containing the scene's INSCENE_* tokens, a SPEAKING_<char> token,
# standalone '?'/'!' markers, and the cleaned lowercase words of the line.
baskets=[]
spchars=["\"", "'", ".", ",", "-"]  # punctuation stripped from words
attributes=["?", "!"]               # kept as separate basket items
for s in magnolia.iterrows():
    # NaN dialogue (from the CSV round-trip) arrives as float; skip it.
    if type(s[1]['dialogue'])!=float and len(s[1]['dialogue'])>0:
        new=[]
        for k in scenes[s[1]['scene']]:
            new.append(k)
        new.append("SPEAKING_"+s[1]['char'])
        for word in s[1]['dialogue'].split(" "):
            cleaned=word
            for t in spchars:
                cleaned=cleaned.replace(t, "")
            for t in attributes:
                if cleaned.find(t)>=0:
                    new.append(t)
                    cleaned=cleaned.replace(t, "")
            if len(cleaned)>0:
                new.append(cleaned.lower())
        new=list(set(new))  # deduplicate items within the basket
        baskets.append(new)
# Drop stopwords and keep two parallel views of each basket:
# a {token: 1} dict (for the one-hot DataFrame) and a plain token list.
baskets2=[]
basketslist=[]
for basket in baskets:
    kept=[token for token in basket if token not in stopwords]
    baskets2.append({token: 1 for token in kept})
    basketslist.append(kept)
# One-hot encode the baskets and mine association rules with mlxtend.
baskets2=pd.DataFrame(baskets2)
from mlxtend.frequent_patterns import apriori, association_rules

baskets2=baskets2.fillna(0)
baskets2.to_csv(name+'_basket.csv')

# Itemsets must occur in at least 5 baskets.
frequent_itemsets = apriori(baskets2, min_support=5/len(baskets2), use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# NOTE(review): the column name 'antecedants' exists only in old mlxtend
# releases; current versions spell it 'antecedents' — confirm the installed
# version before running.
# Flag rules where at least one / both sides pass lib.py's alllower()
# (presumably all-lowercase word tokens rather than SPEAKING_/INSCENE_
# markers — confirm in lib.py).
rules['one_lower']=[int(alllower(i) or alllower(j))
                    for i, j in zip(rules['antecedants'], rules['consequents'])]
rules['both_lower']=[int(alllower(i) and alllower(j))
                     for i, j in zip(rules['antecedants'], rules['consequents'])]
rules.to_csv(name+'_rules.csv', index=None)
| . |
|---|
| SICK BOY |
| Palabras Distintas |
|---|
| 1268 |
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.576772 | 12.3% |
| sentiment | Porcentaje |
|---|---|
| positive | 50.302% |
| negative | 49.698% |
| sentiment | Porcentaje |
|---|---|
| positive | 17.3% |
| negative | 15.7% |
| trust | 11.7% |
| anticipation | 10.8% |
| joy | 9.4% |
| sadness | 8.5% |
| fear | 7.1% |
| surprise | 6.7% |
| anger | 6.4% |
| disgust | 6.4% |
| sentiment | Porcentaje |
|---|---|
| negative | 43.3% |
| positive | 34.8% |
| uncertainty | 15.4% |
| litigious | 4.5% |
| constraining | 1.5% |
| superfluous | 0.5% |
[1] “Analisis de Sentimientos del Personaje: RENTON” [1] “Numero total de Palabras Unicas en el texto: 494”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.904 | 12.1% |
| sentiment | Porcentaje |
|---|---|
| positive | 53.39% |
| negative | 46.61% |
| sentiment | Porcentaje |
|---|---|
| positive | 17.8% |
| negative | 14.8% |
| trust | 11.1% |
| anticipation | 10.4% |
| joy | 9.4% |
| sadness | 8.8% |
| surprise | 7.7% |
| disgust | 7.4% |
| fear | 6.4% |
| anger | 6.1% |
| sentiment | Porcentaje |
|---|---|
| positive | 44.2% |
| negative | 37.2% |
| uncertainty | 18.6% |
[1] “Analisis de Sentimientos del Personaje: SICK BOY” [1] “Numero total de Palabras Unicas en el texto: 469”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.8875 | 9.38% |
| sentiment | Porcentaje |
|---|---|
| positive | 56.4% |
| negative | 43.6% |
| sentiment | Porcentaje |
|---|---|
| positive | 20.7% |
| anticipation | 12.7% |
| negative | 12.7% |
| trust | 12.2% |
| joy | 10.8% |
| sadness | 7.0% |
| fear | 6.6% |
| anger | 6.1% |
| surprise | 6.1% |
| disgust | 5.2% |
| sentiment | Porcentaje |
|---|---|
| positive | 56.4% |
| negative | 33.3% |
| uncertainty | 7.7% |
| constraining | 2.6% |
[1] “Analisis de Sentimientos del Personaje: BEGBIE” [1] “Numero total de Palabras Unicas en el texto: 238”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 2.75 | 11.8% |
| sentiment | Porcentaje |
|---|---|
| negative | 78.6% |
| positive | 21.4% |
| sentiment | Porcentaje |
|---|---|
| negative | 25.9% |
| disgust | 14.4% |
| sadness | 12.9% |
| fear | 10.1% |
| anger | 8.6% |
| positive | 7.9% |
| trust | 6.5% |
| surprise | 5.8% |
| joy | 4.3% |
| anticipation | 3.6% |
| sentiment | Porcentaje |
|---|---|
| negative | 60.0% |
| positive | 20.0% |
| litigious | 13.3% |
| uncertainty | 6.7% |
[1] “Analisis de Sentimientos del Personaje: SPUD” [1] “Numero total de Palabras Unicas en el texto: 299”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.521127 | 12.7% |
| sentiment | Porcentaje |
|---|---|
| positive | 71.2% |
| negative | 28.8% |
| sentiment | Porcentaje |
|---|---|
| positive | 16.3% |
| trust | 14.1% |
| negative | 13.3% |
| anticipation | 11.9% |
| joy | 9.6% |
| fear | 8.9% |
| surprise | 8.9% |
| sadness | 7.4% |
| anger | 5.2% |
| disgust | 4.4% |
| sentiment | Porcentaje |
|---|---|
| negative | 34.6% |
| positive | 34.6% |
| uncertainty | 26.9% |
| litigious | 3.8% |
[1] “Analisis de Sentimientos del Personaje: DIANE” [1] “Numero total de Palabras Unicas en el texto: 187”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 6.047619 | 10.2% |
| sentiment | Porcentaje |
|---|---|
| positive | 75.9% |
| negative | 24.1% |
| sentiment | Porcentaje |
|---|---|
| positive | 21.7% |
| negative | 14.1% |
| joy | 13.0% |
| trust | 13.0% |
| surprise | 8.7% |
| fear | 7.6% |
| sadness | 6.5% |
| anger | 5.4% |
| anticipation | 5.4% |
| disgust | 4.3% |
| sentiment | Porcentaje |
|---|---|
| negative | 50.0% |
| positive | 25.0% |
| litigious | 12.5% |
| uncertainty | 12.5% |
[1] “Analisis de Sentimientos del Personaje: TOMMY” [1] “Numero total de Palabras Unicas en el texto: 179”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.473684 | 13.4% |
| sentiment | Porcentaje |
|---|---|
| positive | 51.72% |
| negative | 48.28% |
| sentiment | Porcentaje |
|---|---|
| anticipation | 18.1% |
| negative | 16.7% |
| positive | 15.3% |
| trust | 12.5% |
| joy | 11.1% |
| anger | 6.9% |
| sadness | 6.9% |
| surprise | 6.9% |
| disgust | 4.2% |
| fear | 1.4% |
| sentiment | Porcentaje |
|---|---|
| positive | 40% |
| negative | 30% |
| uncertainty | 20% |
| superfluous | 10% |
[1] “Analisis de Sentimientos del Personaje: SWANNEY” [1] “Numero total de Palabras Unicas en el texto: 142”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.153846 | 7.75% |
| sentiment | Porcentaje |
|---|---|
| positive | 57.1% |
| negative | 42.9% |
| sentiment | Porcentaje |
|---|---|
| positive | 24.6% |
| trust | 20.0% |
| anticipation | 13.8% |
| joy | 10.8% |
| fear | 7.7% |
| negative | 7.7% |
| surprise | 6.2% |
| sadness | 4.6% |
| anger | 3.1% |
| disgust | 1.5% |
| sentiment | Porcentaje |
|---|---|
| negative | 50.0% |
| constraining | 16.7% |
| litigious | 16.7% |
| uncertainty | 16.7% |
[1] “Analisis de Sentimientos del Personaje: MOTHER” [1] “Numero total de Palabras Unicas en el texto: 112”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.125 | 7.14% |
| sentiment | Porcentaje |
|---|---|
| negative | 77.8% |
| positive | 22.2% |
| sentiment | Porcentaje |
|---|---|
| negative | 20.0% |
| sadness | 17.1% |
| fear | 14.3% |
| positive | 14.3% |
| anticipation | 8.6% |
| joy | 8.6% |
| trust | 8.6% |
| anger | 2.9% |
| disgust | 2.9% |
| surprise | 2.9% |
| sentiment | Porcentaje |
|---|---|
| negative | 85.7% |
| positive | 14.3% |
[1] “Analisis de Sentimientos del Personaje: GAIL” [1] “Numero total de Palabras Unicas en el texto: 65”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.375 | 12.3% |
| sentiment | Porcentaje |
|---|---|
| negative | 55.6% |
| positive | 44.4% |
| sentiment | Porcentaje |
|---|---|
| negative | 19.2% |
| positive | 19.2% |
| anticipation | 15.4% |
| joy | 15.4% |
| trust | 15.4% |
| fear | 7.7% |
| sadness | 3.8% |
| surprise | 3.8% |
| sentiment | Porcentaje |
|---|---|
| negative | 33.3% |
| positive | 33.3% |
| uncertainty | 33.3% |
[1] “Analisis de Sentimientos del Personaje: MAN” [1] “Numero total de Palabras Unicas en el texto: 92”
| Descripción | Score | % Found Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.727273 | 7.61% |
| sentiment | Porcentaje |
|---|---|
| negative | 53.33% |
| positive | 46.67% |
| sentiment | Porcentaje |
|---|---|
| positive | 52.9% |
| trust | 29.4% |
| joy | 11.8% |
| sadness | 5.9% |
| sentiment | Porcentaje |
|---|---|
| positive | 100% |
| Personaje | Min_Max | Dialogo |
|---|---|---|
| RENTON | MIN | I want a fucking hit. |
| RENTON | MAX | Sounds great, Swanney. |
| SICK BOY | MIN | Fuck you. OK, so Tommy’s got the virus. Bad news, big deal. The gig goes on, or hadn’t you noticed? Swanney fucks his leg up. Well, tough shit, but it could have been worse. |
| SICK BOY | MAX | No, it’s not bad, but it’s not great either, is it? And in your heart you kind of know that although it sounds all right, it’s actually just shite. |
| BEGBIE | MIN | Because I fucking told you to do that, you doss cunt. |
| BEGBIE | MAX | I’m no a fucking buftie and that’s the end of it. |
| SPUD | MIN | A little dab of speed is just the ticket. |
| SPUD | MAX | The pleasure was mine. Best interview I’ve ever been to. Thanks. |
| DIANE | MIN | Shut up. |
| DIANE | MAX | It’s where I live. |
| TOMMY | MIN | Well, what are you waiting for? |
| TOMMY | MAX | Thanks, Mark. |
| SWANNEY | MIN | Well, it’s up to you. |
| SWANNEY | MAX | you’ll need one more hit. |
| MOTHER | MIN | No problem for me either. Honestly, it’s no problem. |
| MOTHER | MAX | No. No clinics, no methadone. That made you worse, you said so yourself. You lied to us, son, your own mother and father. |
| GAIL | MIN | Not much. |
| GAIL | MAX | It’s all right. I slept fine on the sofa. |
| MAN | MIN | And who the fuck do you think you are? |
| MAN | MAX | But it’s not worth more than fifteen. |
| Parejas | Min_Max | Dialogo |
|---|---|---|
| RENTON_SICK BOY | MIN | And I got a stitch stuck between my teeth, jerked my head back and the whole fucking stump fell off. |
| RENTON_SICK BOY | MAX | Despite the Academy award? |
| SICK BOY_BEGBIE | MIN | Because I fucking told you to do that, you doss cunt. |
| SICK BOY_BEGBIE | MAX | We’re two thousand short. |
| RENTON_BEGBIE | MIN | You’re not going to and fucking hospital. You’re staying there. And you bring me a fucking cigarette. |
| RENTON_BEGBIE | MAX | But you don’t have the money? |
| RENTON_SPUD | MIN | Sorry, boys, I don’t have two thousand pounds. |
| RENTON_SPUD | MAX | Right. |
## [1] "Lift Promedio de las Reglas de Asociacion: 8.02944052745034"
## [1] "Desviación estándar del Lift de las Reglas de Asociación: 9.87228101321947"
## [1] "Deciles del Lift : "
## 10% 20% 30% 40% 50% 60% 70%
## 1.363388 1.851577 2.434146 3.205523 3.929134 5.280423 7.920635
## 80% 90% 100%
## 10.915625 21.695652 83.166667
| Numero de Dialogos | Lift Minimo | Lift Maximo |
|---|---|---|
| 1,386 | -1 | 1 |
| 4,064 | 1 | 4 |
| 1,576 | 4 | 7 |
| 510 | 7 | 10 |
| 948 | 10 | 13 |
| 118 | 13 | 16 |
## [1] "Leverage Promedio de las Reglas de Asociacion: 0.01313658802416"
## [1] "Desviación estándar del Leverage de las Reglas de Asociación: 0.0103336215657906"
## [1] "Deciles del Leverage : "
## 10% 20% 30% 40% 50% 60%
## 0.003764242 0.006381500 0.007634507 0.008995948 0.009718836 0.011534090
## 70% 80% 90% 100%
## 0.013985486 0.021457745 0.024851306 0.103859021
| Numero de Dialogos | Leverage Minimo | Leverage Maximo |
|---|---|---|
| 314 | -0.0018 | 0.0018 |
| 1,216 | 0.0018 | 0.0054 |
| 2,494 | 0.0054 | 0.009 |
| 2,494 | 0.009 | 0.013 |
| 1,044 | 0.013 | 0.016 |
| 472 | 0.016 | 0.02 |
Pagerank: Reservoir Dogs.